library(polars)
## Warning: package 'polars' was built under R version 4.3.2
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
library(microbenchmark)
## Warning: package 'microbenchmark' was built under R version 4.3.2
# Benchmark 1: time a plain read of the BSE CSV with readr versus polars.
# No `times` argument is given, so microbenchmark uses its default number of
# repetitions for each expression.
perf_7mb_file <- microbenchmark(
  tidy   = readr::read_csv("bse_compiled_2023_04_13.csv"),
  polars = pl$read_csv("bse_compiled_2023_04_13.csv")
)

# Show the timing summary, then plot the timings.
perf_7mb_file
autoplot(perf_7mb_file)
Reading with Polars and then converting to an R data frame
# Benchmark 2: same file, but the polars result is converted to an R
# data.frame inside the timed expression, so the conversion cost is included.
perf_7mb_df <- microbenchmark(
  tidy   = readr::read_csv("bse_compiled_2023_04_13.csv"),
  polars = as.data.frame(pl$read_csv("bse_compiled_2023_04_13.csv"))
)

# Show the timing summary, then plot the timings.
perf_7mb_df
autoplot(perf_7mb_df)
Reading with Polars, converting to an R data frame, and taking the head (first 10 rows)
# Benchmark five ways of reading a CSV and keeping only its first 10 rows.
#
# data_file: path of the CSV file passed to the readr and polars readers.
#
# Returns the microbenchmark result.
# NOTE: the result is also written to `perf_data` outside this function via
# `<<-`; the `autoplot(perf_data)` calls that follow each invocation rely on
# that variable, so the side effect is intentional here.
perf_testing_fn <- function(data_file) {
  timings <- microbenchmark(
    # readr: read the whole file, then keep 10 rows.
    tidy = readr::read_csv(data_file) %>% head(n = 10),
    # polars eager read, convert everything to a data.frame, then keep 10 rows.
    polars = as.data.frame(pl$read_csv(data_file)) %>% head(n = 10),
    # polars eager read, keep 10 rows in polars, then convert.
    polars_head = as.data.frame(pl$read_csv(data_file)$head(n = 10)),
    # polars lazy scan, keep 10 rows, collect to a polars DataFrame.
    polars_lazy_head = pl$scan_csv(data_file)$head(n = 10)$collect(),
    # polars lazy scan, keep 10 rows, convert straight to a data.frame.
    polars_lazy_head_df = as.data.frame(pl$scan_csv(data_file)$head(n = 10))
  )
  perf_data <<- timings
  timings
}
# Run the read-and-head benchmark on the BSE CSV. perf_testing_fn() also
# stores its result in `perf_data` (assigned with `<<-` inside the function),
# which is what autoplot() reads on the next line.
perf_testing_fn("bse_compiled_2023_04_13.csv")
autoplot(perf_data)
Reading with Polars, converting to an R data frame, and taking the head (first 10 rows), on the T_ONTIME_REPORTING file
# Repeat the read-and-head benchmark on the on-time reporting CSV. The call
# overwrites `perf_data` (assigned with `<<-` inside perf_testing_fn), so
# autoplot() below plots this run's timings.
perf_testing_fn("T_ONTIME_REPORTING.csv")
autoplot(perf_data)
Data
# Load the on-time reporting CSV with polars and show it as an R data.frame.
as.data.frame(pl$read_csv("T_ONTIME_REPORTING.csv"))

# For rows whose ORIGIN is "JFK": number of distinct DEST_CITY_NAME values
# per ORIGIN_CITY_NAME.
pl$read_csv("T_ONTIME_REPORTING.csv")$
  filter(pl$col("ORIGIN") == "JFK")$
  group_by("ORIGIN_CITY_NAME")$
  agg(pl$col("DEST_CITY_NAME")$n_unique())
| ORIGIN_CITY_NAME | DEST_CITY_NAME |
|---|---|
| str | u32 |
| "New York, NY" | 64 |
# polars: for rows whose ORIGIN is "JFK", count rows per
# (ORIGIN_CITY_NAME, DEST_CITY_NAME) pair; the count column is named "counts".
pl$read_csv("T_ONTIME_REPORTING.csv")$
  filter(pl$col("ORIGIN") == "JFK")$
  group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
  agg(pl$len()$alias("counts"))
| ORIGIN_CITY_NAME | DEST_CITY_NAME | counts |
|---|---|---|
| str | str | u32 |
| "New York, NY" | "Indianapolis, … | 163 |
| "New York, NY" | "Orlando, FL" | 404 |
| "New York, NY" | "Denver, CO" | 131 |
| "New York, NY" | "Minneapolis, M… | 124 |
| "New York, NY" | "San Antonio, T… | 31 |
| "New York, NY" | "Los Angeles, C… | 814 |
| "New York, NY" | "Dallas/Fort Wo… | 174 |
| "New York, NY" | "Washington, DC… | 330 |
| "New York, NY" | "Milwaukee, WI" | 62 |
| "New York, NY" | "Eagle, CO" | 31 |
| "New York, NY" | "Aguadilla, PR" | 62 |
| "New York, NY" | "Burbank, CA" | 31 |
| "New York, NY" | "Norfolk, VA" | 151 |
| "New York, NY" | "Chicago, IL" | 261 |
| "New York, NY" | "Cincinnati, OH… | 110 |
| "New York, NY" | "Seattle, WA" | 210 |
| "New York, NY" | "Jacksonville, … | 157 |
| "New York, NY" | "Palm Springs, … | 18 |
| "New York, NY" | "Baltimore, MD" | 79 |
| "New York, NY" | "San Juan, PR" | 283 |
| … | … | … |
| "New York, NY" | "Tampa, FL" | 222 |
| "New York, NY" | "Atlanta, GA" | 301 |
| "New York, NY" | "Salt Lake City… | 187 |
| "New York, NY" | "Portland, ME" | 103 |
| "New York, NY" | "Boston, MA" | 532 |
| "New York, NY" | "Columbus, OH" | 182 |
| "New York, NY" | "Nashville, TN" | 171 |
| "New York, NY" | "Burlington, VT… | 171 |
| "New York, NY" | "San Francisco,… | 556 |
| "New York, NY" | "Austin, TX" | 190 |
| "New York, NY" | "San Diego, CA" | 173 |
| "New York, NY" | "Worcester, MA" | 92 |
| "New York, NY" | "Las Vegas, NV" | 246 |
| "New York, NY" | "Richmond, VA" | 79 |
| "New York, NY" | "Detroit, MI" | 206 |
| "New York, NY" | "Syracuse, NY" | 141 |
| "New York, NY" | "Charlotte, NC" | 276 |
| "New York, NY" | "Buffalo, NY" | 282 |
| "New York, NY" | "Santa Ana, CA" | 31 |
| "New York, NY" | "West Palm Beac… | 222 |
# `pl$count()` is deprecated and will be removed in 0.15.0. Use `pl$len()`
# readr + dplyr version of the same query: rows with ORIGIN "JFK", counted per
# (ORIGIN_CITY_NAME, DEST_CITY_NAME) pair into a column named `counts`.
read_csv("T_ONTIME_REPORTING.csv") |>
  filter(ORIGIN == "JFK") |>
  group_by(ORIGIN_CITY_NAME, DEST_CITY_NAME) |>
  summarise(counts = n())
# Benchmark the "filter JFK, count per origin/destination city pair" query
# across five contenders:
#   polars         - eager read, result left as a polars DataFrame
#   polars_df      - eager read, result converted to an R data.frame
#   polars_lazy    - lazy scan, collected to a polars DataFrame
#   polars_lazy_df - lazy scan, collected, then converted to an R data.frame
#   tidy           - readr + dplyr
#
# Fix: `polars_lazy_df` previously called pl$read_csv() (eager), making it a
# duplicate of `polars_df`; it now uses pl$scan_csv() and collects before the
# conversion, so it actually measures the lazy path.
#
# NOTE: the variable name `perf_mainpulation` (sic) is kept unchanged so any
# other code referring to it keeps working.
perf_mainpulation <- microbenchmark(
  polars = pl$read_csv("T_ONTIME_REPORTING.csv")$
    filter(pl$col("ORIGIN") == "JFK")$
    group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
    agg(pl$len()$alias("counts")),
  polars_df = pl$read_csv("T_ONTIME_REPORTING.csv")$
    filter(pl$col("ORIGIN") == "JFK")$
    group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
    agg(pl$len()$alias("counts")) |>
    as.data.frame(),
  polars_lazy = pl$scan_csv("T_ONTIME_REPORTING.csv")$
    filter(pl$col("ORIGIN") == "JFK")$
    group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
    agg(pl$len()$alias("counts"))$
    collect(),
  polars_lazy_df = pl$scan_csv("T_ONTIME_REPORTING.csv")$
    filter(pl$col("ORIGIN") == "JFK")$
    group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
    agg(pl$len()$alias("counts"))$
    collect() |>
    as.data.frame(),
  tidy = read_csv("T_ONTIME_REPORTING.csv") %>%
    filter(ORIGIN == "JFK") %>%
    group_by(ORIGIN_CITY_NAME, DEST_CITY_NAME) %>%
    summarise(counts = n())
)

# Show the timing summary, then plot the timings.
perf_mainpulation
autoplot(perf_mainpulation)
library(profvis)
# Profile reading the BSE CSV with polars, including the conversion to an R
# data.frame.
profvis({
  as.data.frame(pl$read_csv("bse_compiled_2023_04_13.csv"))
})

# Profile reading the same file with readr.
profvis({
  read_csv("bse_compiled_2023_04_13.csv")
})
# Profile reading the on-time reporting CSV with polars, including the
# conversion to an R data.frame.
profvis({
  as.data.frame(pl$read_csv("T_ONTIME_REPORTING.csv"))
})

# Profile reading the same file with readr.
profvis({
  read_csv("T_ONTIME_REPORTING.csv")
})
# Profile the polars query (rows with ORIGIN "JFK", counted per
# origin/destination city pair), leaving the result as a polars DataFrame.
profvis({
  pl$read_csv("T_ONTIME_REPORTING.csv")$
    filter(pl$col("ORIGIN") == "JFK")$
    group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
    agg(pl$len()$alias("counts"))
})

# Profile the same polars query with the result converted to an R data.frame.
profvis({
  as.data.frame(
    pl$read_csv("T_ONTIME_REPORTING.csv")$
      filter(pl$col("ORIGIN") == "JFK")$
      group_by(c("ORIGIN_CITY_NAME", "DEST_CITY_NAME"))$
      agg(pl$len()$alias("counts"))
  )
})
Tidyverse approach (readr + dplyr)
# Profile the readr + dplyr version of the query: rows with ORIGIN "JFK",
# counted per (ORIGIN_CITY_NAME, DEST_CITY_NAME) pair.
profvis({
  read_csv("T_ONTIME_REPORTING.csv") %>%
    filter(ORIGIN == "JFK") %>%
    group_by(ORIGIN_CITY_NAME, DEST_CITY_NAME) %>%
    summarise(counts = n())
})